/*    Copyright 2012 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.IIDTextModel;
import mosdi.paa.PAA;
import mosdi.paa.TextBasedPAA;
import mosdi.paa.apps.MassOccurrenceDAA;
import mosdi.util.Alphabet;
import mosdi.util.InvalidInputFileException;
import mosdi.util.Log;
import mosdi.util.SequenceUtils;

public class MassProbabilitySubcommand extends Subcommand {

	@Override
	public String usage() {
		return
		super.usage()+" [options] <length-dist> <min-mass> <max-mass>\n" +
		"\n" +
		"Computes the probability that a random protein contains a fragment\n" +
		"(w.r.t. to trypsin cleavage) whose mass is between <min-mass> and\n" +
		"<max-mass>. The length of the random protein is assumed to follow\n" +
		"the distribution given in <length-dist> (such a file can be created\n" +
		"using \"mosdi-utils length-histogram\").\n" +
		"\n" +
		"Options:\n" +
		"  -q <q-gram-table-file>: Estimate text model from given q-gram table (default: uniform i.i.d.)\n" +
		"                          A q-gram table can be created by using \"mosdi-utils count-qgrams\".\n" +
		"  -a <accuracy>: accuracy used to discretize masses. A value of 0.1,\n" +
		"                 for instance, means that masses are handled in discrete\n" +
		"                 units of 0.1Da. (Default: 0.1)";
	}

	@Override
	public String description() {
		return "Computes the probability of a fragment mass occurring within a random protein.";
	}

	@Override
	public String name() {
		return "mass-prob";
	}

	private class LengthAndProbability {
		int length;
		double probability;
		private LengthAndProbability(int length, double probability) {
			this.length = length;
			this.probability = probability;
		}
	}
	
	private List<LengthAndProbability> parseLengthDistribution(String filename) throws FileNotFoundException, IOException, InvalidInputFileException {
		ArrayList<LengthAndProbability> result = new ArrayList<LengthAndProbability>();
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
		int lineNr = 0;
		double sum = 0.0;
		while (true) {
			String line = br.readLine();
			lineNr += 1;
			if (line==null) break;
			line = line.trim();
			StringTokenizer tokenizer = new StringTokenizer(line, " \t");
			try {
				int length = Integer.parseInt(tokenizer.nextToken());
				double probability = Double.parseDouble(tokenizer.nextToken());
				result.add(new LengthAndProbability(length,probability));
				sum += probability;
				if (tokenizer.hasMoreTokens()) throw new InvalidInputFileException();
			} catch (Exception e) {
				throw new InvalidInputFileException(String.format("Invalid input file \"%s\". Offending line: %d", filename, lineNr));
			}
		}
		// Normalize distribution to one
		for (LengthAndProbability e : result) {
			e.probability /= sum;
		}
		return result;
	}
	
	@Override
	public int run(String[] args) {
		parseOptions(args, 3, "q:a:");

		// Option dependencies
		// -- NONE --

		// Mandatory arguments
		String lengthDistFilename = getStringArgument(0);
		double minMass = getDoubleArgument(1);
		double maxMass = getDoubleArgument(2);

		// Options
		double massAccuracy = getPositiveDoubleOption("a", 0.1);
		String qGramTableFilename = getStringOption("q", null);

		if ((minMass < 0.0) || (minMass >= maxMass)) {
			Log.errorln("Invalid mass range.");
			return 1;
		}
		
		Alphabet alphabet = Alphabet.getAminoAcidAlphabet();
		
		// Create text model
		FiniteMemoryTextModel textModel = null;
		if (qGramTableFilename == null) {
			textModel = new IIDTextModel(alphabet.size());
		} else {
			try {
				textModel = SequenceUtils.buildTextModelFromQGramFile(qGramTableFilename, alphabet);
			} catch (Exception e) {
				Log.errorln(e.toString());
				System.exit(1);
			}
		}
		// Read length distribution
		List<LengthAndProbability> lengthDistribution = null;
		try {
			lengthDistribution = parseLengthDistribution(lengthDistFilename);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		Log.startTimer();
		MassOccurrenceDAA daa = new MassOccurrenceDAA(massAccuracy, minMass, maxMass);
		int[] targetValues = daa.massFoundValues();
		TextBasedPAA paa = new TextBasedPAA(daa, textModel);
		double[][] stateValueDistribution = paa.stateValueStartDistribution();
		int n = 0;
		double totalProbability = 0.0;
		for (LengthAndProbability e : lengthDistribution) {
			// update joint distribution until it reflects the currently considered
			// protein length
			while (n < e.length) {
				Log.printf(Log.Level.VERBOSE, "Considering length %d\n", n);
				stateValueDistribution = paa.updateStateValueDistribution(stateValueDistribution);
				n += 1;
			}
			// compute the mass occurrence probability:
			// sum over all states for all target values
			double p = 0.0;
			for (int value : targetValues) {
				for (int state = 0; state < paa.getStateCount(); ++state) {
					p += stateValueDistribution[state][value];
				}
			}
			// add contribution made by this protein length 
			totalProbability += e.probability * p;
		}
		Log.stopTimer("Computing mass occurrence probability");
		double time = Log.getLastPeriodCpu();
		Log.printf(Log.Level.STANDARD, ">> %e %f %f %e >>time>> %t\n", totalProbability, minMass, maxMass, massAccuracy, time);
		
		return 0;
	}
}
